/*******************************************************************************

  Paying Outsourced Labor: Direct Evidence from Linked Temp Agency-Worker-Client Data

  By Andres Drenik, Simon Jäger, Pascual Plotkin and Benjamin Schoefer
	January 7th, 2021

	DESCRIPTION: Generates a dataset that indicates the share of CBA coverage by 4-digit industry.

*******************************************************************************/


/*******************************************************************************
***** Preliminaries
*******************************************************************************/
* Turn off output paging, close any log that is still open (ignoring the
* error when none is), and start a new log whose name carries the current date.
set more off
capture log close
local curr_date "`c(current_date)'"
log using "${logs}/11_CBA_Coverage`curr_date'", replace


/*******************************************************************************
*User Settings
*******************************************************************************/

*Dataset to use
* Source data vintage:
*   SIPA    : SIPA at ministry
*   newSIPA : new SIPA at ministry with convencionados variable
global dataset "newSIPA"

**Stats to Estimate**
* Each switch below gates one stage of this script (the stage runs when it equals 1).

*Build Prepped_Dataset
global build_prepped 1

*Convencionados Panel
global convencionados_panel 1


****************************************************************
*Looping over 2009 - 2017 to filter each year's SIPA Dataset
****************************************************************

if $build_prepped == 1{

    * Stage 1a: for every year 2009-2017, load the cleaned yearly file, apply the
    * sample filters, strip the aguinaldo from the wage, deflate it, flag
    * low wages, and save a slim yearly file.
    forvalues  y = 2009/2017{

        * convencionado is only requested for the newSIPA vintage
        if "$dataset" == "newSIPA" local newsipa_vars = "convencionado"

        use date cuil_trab cuit_empl remuner_total gender public_worker modalidad ciiu_4 DOB sac sueldo temp_duplicate `newsipa_vars' using "$XXX/Argentina_Clean_`y'", clear

        *Count Firms and Workers
        * One flag per distinct worker / firm, summed over the whole file and
        * merged back so every row holds the totals. They are written to the
        * log before being dropped (previously they were discarded unreported).
        bys cuil_trab: gen worker_count = _n == 1
        bys cuit_empl: gen firm_count = _n == 1
        gcollapse (sum) worker_count firm_count, merge replace fast
        display as text "Year `y': distinct workers = " %15.0fc worker_count[1] ", distinct firms = " %15.0fc firm_count[1]

        drop worker_count firm_count

        *Keep Men:
        * NOTE(review): relies on gender code 3 meaning men, confirm against the codebook
        qui count
        local n_before = `r(N)'
        keep if gender == 3
        qui count
        local n_gender = `n_before' - `r(N)'
        drop gender

        * Age restriction: keep ages 25 to 65, thresholds expressed as 25*12 and 65*12.
        * NOTE(review): assumes date and DOB are both monthly dates so that age is in months, confirm
        gen age = date - DOB
        drop DOB

        drop if age < 25*12
        drop if age > 65*12

        qui count
        local n_age = `n_before' - `n_gender' - `r(N)'

        * Report how many observations each filter removed
        display as text "Year `y': observations dropped by gender filter = `n_gender', by age filter = `n_age'"

        * NOTE(review): age is rebased here but is not in the keep list below, so it never reaches the saved file
        replace age = age - 25*12 + 1

        *Erase Temp duplicates
        drop if temp_duplicate == 1
        drop temp_duplicate

        * Public-sector earnings are blanked out
        replace remuner_total = . if public_worker == 1

        *Get rid of aguinaldo:
        gen daily    = dofm(date)
        gen semester = halfyear(daily)
        gen month    = month(daily)

        * Keep one row per worker-month: the last one when sorted by remuner_total.
        * NOTE(review): missing values sort last in Stata, so a row whose
        * remuner_total was set to missing above (public worker) wins over any
        * row with observed earnings in the same month. Confirm this is intended.
        bys cuil_trab date (remuner_total): keep if _n == _N

        *Count Months Worked over semester:
        bys cuil_trab cuit_empl semester: gegen months_worked = count(remuner_total)

        *Issue: In the last month we don't observe the wage but wage + aguinaldo. So in order to determine max wage:
        gen wage = remuner_total

        *If Wage in month 6/12 is the highest in the semester, then it would be:
        replace wage = remuner_total/(1 + months_worked/12) if inlist(month,6,12)
        bys cuil_trab cuit_empl semester: gegen max_wage_over_semester = max(wage)

        * Implied aguinaldo: highest semester wage scaled by months_worked/12,
        * assigned to months 6 and 12 only
        gen aguinaldo = 0
        replace aguinaldo = max_wage_over_semester*months_worked/12 if inlist(month, 6, 12)
        drop months_worked max_wage_over_semester

        * Subtract it only when that leaves a positive amount
        replace wage = remuner_total - aguinaldo if aguinaldo < remuner_total

        * Deflate by the price index; months without an index are dropped by keep(matched)
        merge m:1 date using "$input/ArgentinaPriceLevel.dta", keepusing(index) keep(matched) nogen

        replace wage = wage/index

        * NOTE(review): the global name intermediate_data_clea looks truncated
        * (the rest of the script uses intermediate_data_emp). An undefined global
        * expands to an empty string, so verify it is defined by the caller.
        merge m:1 date using "${intermediate_data_clea}/WageThreshold.dta" , keep(matched) nogen

        * Cap the wage at the 99.999th percentile (no trimming at the bottom)
        winsor2 wage, replace c(0 99.999)

        * Flag wages below half of the monthly threshold
        gen below_threshold = wage < 0.5*threshold

        drop threshold

        * From here on, wage is the log real wage
        gen log_wage = log(wage)
        drop wage
        rename log_wage wage

        *Save
        keep date cuil_trab cuit_empl wage ciiu_4 below_threshold `newsipa_vars' public_worker
        compress
        save "${intermediate_data_emp}/Prepped_Dataset_`y'.dta", replace

    }

    * Stage 1b: stack the yearly files into one worker-month panel
    clear

    forvalues y = 2009/2017 {

            append using "${intermediate_data_emp}/Prepped_Dataset_`y'.dta"

    }

    xtset cuil_trab date

    * tsspell adds _spell, _seq and _end for runs of the same employer within worker
    tsspell cuit_empl

    *We are not counting first and last obs in spell as below threshold.
    gen first_or_last_obs =  _seq == 1 | _end == 1

    replace below_threshold = 0 if first_or_last_obs == 1

    replace wage  = . if below_threshold == 1 | first_or_last_obs == 1

    drop _seq _end

    compress

    save "${intermediate_data_emp}/Prepped_Dataset.dta", replace

    * Remove the yearly intermediates only now that the pooled file is safely
    * on disk, so a failure in the steps above does not force a full rebuild.
    forvalues y = 2009/2017 {

            erase "${intermediate_data_emp}/Prepped_Dataset_`y'.dta"

    }

}


if $convencionados_panel == 1 {

    * Builds worker counts by month and 4-digit industry, split by the value
    * of convencionado, from the pooled prepped dataset.

    * Only the columns needed for this stage are read from disk
    use ciiu_4 convencionado date cuit_empl public_worker below_threshold using "${intermediate_data_emp}/Prepped_Dataset.dta", clear

    * Sample: private-sector observations not flagged as below the wage threshold
    drop if public_worker == 1
    drop if below_threshold == 1

    * first_or_last_obs is not part of the varlist loaded above, so it must not
    * be listed here: drop aborts with r(111) when a named variable is not in
    * memory, which stopped this stage before any output was written.
    drop public_worker below_threshold

    * Indicator columns (1 or missing); the sums below turn them into counts
    gen worker = 1

    gen conv_equal_1 = 1 if (convencionado == 1)

    gen conv_missing = 1 if (convencionado == .)

    gen conv_equal_0 = 1 if (convencionado == 0)

    gen conv_equal_other = 1 if (convencionado != 1 & convencionado != 0 & convencionado != .)

    * NOTE(review): workers_in_firm is merged back here but is not carried
    * through the collapse below, so it never reaches the saved file. Confirm
    * whether a firm-size filter was meant to be applied at this point.
    gcollapse (sum) workers_in_firm = worker, by(date cuit_empl) fast merge replace unsorted

    * One row per month and 4-digit industry with the counts in each category
    gcollapse (sum) worker conv_equal_1 conv_missing conv_equal_0 conv_equal_other, by(date ciiu_4) fast unsorted

    save "${intermediate_data_emp}/ConvencionadosBy4DigitSector.dta", replace

}

* Close the log file opened in the preliminaries
log close
